
library(dplyr)
library(haven)
library(readr)
library(assertr)
library(stringr)
library(tidyr)

setwd('./NC Data')
rawdata_path = 'raw_data_from_ncerdc'

## Read raw data for years 2008 to 2016
PAY = setNames(vector(mode="list", length=9), 2008:2016)
PERS = setNames(vector(mode="list", length=6), 2008:2013)

for (yr in 2008:2016){
  
  cat('Converting .dta file for year',yr,'\n')

  if (yr %in% 2008:2011){
    pay_file = sprintf('lspaysnap%s.dta',yr)
  } else if (yr %in% 2012:2016){
    pay_file = sprintf('certsalpub%s.dta',yr)
  }
  
  PAY[[paste(yr)]] = read_dta(file.path(rawdata_path,pay_file),encoding="latin1") %>%
    filter(!is.na(teachid)) %>%
    rename(any_of(c(year = 'fisyr', pay_lvl_cd = 'pay_level'))) %>%
    mutate(across(c("year","teachid"),as.numeric)) %>%
    bind_rows(tibble(tchr_exp = numeric(), ethnic_cd = character(), gender = character())) %>%
    select(teachid,pay_lvl_cd,tchr_exp,year,ethnic_cd,gender)

  if (yr %in% 2008:2013){
    pers_file = sprintf('pers%s.dta',yr)
    
    PERS[[paste(yr)]] = read_dta(file.path(rawdata_path,pers_file),encoding="latin1") %>%
      filter(!is.na(teachid)) %>%
      rename(any_of(c(year = 'fiscyr'))) %>%
      mutate(across(c("year","teachid"),as.numeric)) %>%
      select(teachid,year,tch_sex,tch_eth)
  }

}   
    
PAY = bind_rows(PAY)
PERS = bind_rows(PERS)

## Functions
mode_or_last = function(x){
  y = DescTools::Mode(x, na.rm = TRUE)
  if (is.na(y[1])){
    y = last(x[!is.na(x)])
  } else if (length(y)>1){
    y = last(x[x %in% y])
  } else {
    y = y[1]
  }
  return(y)
}

exp_str_pat = '(?<=^(A[\\sS]|M[\\sDS]|3[\\sDHS]))\\d{2}(?=(\\s+[NM]|)$)'

PAY = PAY %>%
  mutate(pay_lvl_exp = str_extract(pay_lvl_cd,exp_str_pat),
         pay_lvl_exp = as.numeric(pay_lvl_exp),
         pay_lvl_exp = replace(pay_lvl_exp,pay_lvl_cd=='3H',0),
         tch_exp = case_when(!is.na(tchr_exp) ~ tchr_exp,
                             !is.na(pay_lvl_exp) ~ pay_lvl_exp + 1),
         across(c('gender','ethnic_cd'),~ replace(.x,.x=='',NA))) %>%
  group_by(teachid, year) %>%
  summarize(tch_exp = mode_or_last(tch_exp),
            ethnic_cd = mode_or_last(ethnic_cd),
            gender = mode_or_last(gender), .groups = 'drop') %>%
  arrange(teachid,year) %>%
  group_by(teachid) %>%
  mutate(tch_exp_obs = replace(tch_exp, is.na(tch_exp), NA),
         year_obs = replace(year, is.na(tch_exp), NA)) %>%
  fill(year_obs,tch_exp_obs, .direction = "updown") %>%
  ungroup() %>%
  mutate(tch_exp = case_when(!is.na(tch_exp) ~ tch_exp,
                             TRUE ~ tch_exp_obs + (year - year_obs)),
         tch_exp = replace(tch_exp, tch_exp<0, 0))
         
DEMOG = PERS %>%
  group_by(teachid, year) %>%
  mutate(across(c('tch_sex', 'tch_eth'),~ replace(.x, .x=='', NA))) %>%
  summarize(tch_sex = mode_or_last(tch_sex),
            tch_eth = mode_or_last(tch_eth), .groups = 'drop') %>%
  full_join(select(PAY, teachid, year, ethnic_cd, gender), by = c('year','teachid')) %>%
  mutate(tch_sex = case_when(!is.na(tch_sex) ~ tch_sex,
                             TRUE ~ gender),
         tch_eth = case_when(!is.na(tch_eth) ~ tch_eth,
                             TRUE ~ ethnic_cd)) %>%
  arrange(teachid,year) %>%
  group_by(teachid) %>%
  summarize(tch_sex = mode_or_last(tch_sex),
            tch_eth = mode_or_last(tch_eth), .groups = 'drop') %>%
  select(teachid, tch_sex, tch_eth)  
  
data = read_rds('student_all.rds') %>%
  pivot_longer(cols = c('teachid_math','teachid_reading'), values_to = 'teachid') %>%
  filter(!is.na(teachid)) %>%
  distinct(teachid,year) %>%
  left_join(DEMOG, by = 'teachid') %>%
  left_join(select(PAY, teachid, year, tch_exp), by = c('teachid','year'))

write_rds(data,'teacher_all.rds')
